import org.apache.spark.{SparkConf,SparkContext}
/** Spark batch job that cleans comma-separated records read from `./in/*`
  * and writes the normalised rows as a single text part under `./out1`.
  *
  * Per row the job:
  *  - drops exact duplicate lines and rows that do not split into 16 fields;
  *  - keeps only rows whose field 12 contains "发布" (publish-time marker);
  *  - truncates field 2 at its first "-";
  *  - strips "招"/"人" from field 11 and maps "若干" ("several") to "3";
  *  - rewrites the pay text in field 13 as an integer amount per month where
  *    the format is recognised, and drops rows whose pay ends up empty.
  */
object test {
  import scala.util.Try

  /** Number of comma-separated fields a row must have to be processed. */
  private val ExpectedFieldCount = 16

  // Zero-based positions of the fields this job reads or rewrites.
  private val LocationIdx = 2    // presumably "city-district"; only the part before "-" is kept
  private val HeadcountIdx = 11  // number of openings, e.g. "招3人"
  private val PublishedIdx = 12  // publish time
  private val SalaryIdx = 13     // pay text such as "1-1.5万/月"

  // Conversions from the numeric part of a pay string to an amount per month.
  private val thousandPerMonth: Double => Double = _ * 1000
  private val tenThousandPerMonth: Double => Double = _ * 10000
  private val tenThousandPerYear: Double => Double = _ * 10000 / 12

  /** Units recognised for "low-high<unit>" pay ranges, in match priority order. */
  private val RangeUnits: Seq[(String, Double => Double)] = Seq(
    ("千/月", thousandPerMonth),
    ("万/月", tenThousandPerMonth),
    ("万/年", tenThousandPerYear)
  )

  /** Units recognised for open-ended pay ("above"/"below" a bound), in match
    * priority order. The last element is added to the converted bound so the
    * result lies just above (+1) or just below (-1) it.
    */
  private val OpenEndedUnits: Seq[(String, Double => Double, Int)] = Seq(
    ("万以上/月", tenThousandPerMonth, 1),
    ("万以下/年", tenThousandPerYear, -1),
    ("千以下/月", thousandPerMonth, -1),
    ("万以上/年", tenThousandPerYear, 1)
  )

  /** Returns the part of `text` before its first "-", or `text` itself when
    * it contains no "-".
    */
  private def beforeFirstDash(text: String): String = {
    val dash = text.indexOf("-")
    if (dash >= 0) text.substring(0, dash) else text
  }

  /** Removes the "招" and "人" decorations and replaces "若干" with "3". */
  private def cleanHeadcount(text: String): String =
    text.replace("招", "").replace("人", "").replace("若干", "3")

  /** Converts a "low-high<unit>" pay range to the truncated midpoint per month.
    *
    * Text with an unrecognised unit is returned unchanged. Text whose bounds
    * are not parseable numbers is also returned unchanged rather than failing
    * the whole job.
    */
  private def rangeMidpoint(raw: String): String =
    RangeUnits.find { case (unit, _) => raw.contains(unit) } match {
      case Some((unit, toMonthly)) =>
        val low = raw.substring(0, raw.indexOf("-"))
        val high = raw.substring(raw.lastIndexOf("-") + 1).replace(unit, "")
        Try(((toMonthly(low.toDouble) + toMonthly(high.toDouble)) / 2).toInt.toString)
          .getOrElse(raw)
      case None => raw
    }

  /** Converts pay text without a "-" to an amount per month.
    *
    * Hourly ("元/小时") and daily ("元/天") pay becomes "" so the row is
    * dropped later. Text with an unrecognised unit, or with a bound that is
    * not a parseable number, is returned unchanged.
    */
  private def openEndedAmount(raw: String): String =
    if (raw.contains("元/小时") || raw.contains("元/天")) ""
    else
      OpenEndedUnits.find { case (unit, _, _) => raw.contains(unit) } match {
        case Some((unit, toMonthly, nudge)) =>
          Try((toMonthly(raw.replace(unit, "").toDouble).toInt + nudge).toString)
            .getOrElse(raw)
        case None => raw
      }

  /** Normalises one pay string; always yields a String so the column keeps a
    * single type instead of widening to `Any`.
    */
  private def normalizeSalary(raw: String): String =
    if (raw.contains("-")) rangeMidpoint(raw) else openEndedAmount(raw)

  /** Entry point: runs the whole pipeline on a local Spark master.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("test")
    val sc = new SparkContext(conf)
    try {
      // NOTE(review): split(",") discards trailing empty fields, so rows whose
      // last columns are blank have fewer than 16 fields and are dropped here.
      // Confirm that is intended.
      val published = sc.textFile("./in/*")
        .distinct
        .map(line => line.split(","))
        .filter(fields => fields.length == ExpectedFieldCount)
        .filter(fields => fields(PublishedIdx).contains("发布"))
      // Further filters that were tried and are currently disabled:
      //   .filter(x => x(11).contains("人"))                               // headcount
      //   .filter(x => !x(8).contains("人"))                               // education
      //   .filter(x => x(9).contains("生") || x(5).contains("经验"))        // experience
      //   .filter(x => !x(6).equals(""))

      val cleaned = published
        .map { fields =>
          fields
            .updated(LocationIdx, beforeFirstDash(fields(LocationIdx)))
            .updated(HeadcountIdx, cleanHeadcount(fields(HeadcountIdx)))
            .updated(SalaryIdx, normalizeSalary(fields(SalaryIdx)))
        }
        .filter(fields => fields(SalaryIdx).nonEmpty)
        .cache() // used by both count and save below; avoid recomputing the lineage

      println(cleaned.count())
      cleaned.map(fields => fields.mkString(",")).repartition(1).saveAsTextFile("./out1")
    } finally {
      // Release the context even when a stage fails.
      sc.stop()
    }
  }
}
